## Run ConnectIPL.sh
# Work from the project directory and show what it contains
setwd("~/NAS/projects/swisspilot/")
list.files()

# Read the comma-separated extract; the first line supplies the column names.
# NOTE(review): read.table() keeps its default quote and comment characters
# here, so apostrophes or "#" inside text fields may affect parsing - confirm
# against the raw file.
d <- read.table(
  file = "ETHZ-Verteilungen.txt",
  sep = ",",
  header = TRUE
)

# Canton codes and the remark field as plain character vectors, so the
# equality test between the two code columns below compares strings
d$KTV_KT_KZ <- as.character(d$KTV_KT_KZ)
d$KTV_PZ_WUNSCH_KT_KZ <- as.character(d$KTV_PZ_WUNSCH_KT_KZ)
d$KTV_BEM <- as.character(d$KTV_BEM)

# TRUE where the wish-canton code is a single blank or "!"
d$nowunsch <- with(d, KTV_PZ_WUNSCH_KT_KZ == " " | KTV_PZ_WUNSCH_KT_KZ == "!")

# Wish comment as text, plus a copy cut to its first 100 characters
d$comment <- as.character(d$KTV_PZ_WUNSCH_BEM)
d$comment100 <- substr(d$comment, 1, 100)

# TRUE where the wish comment / the remark field holds at least one letter
d$anytext <- grepl("[[:alpha:]]", d$comment)
d$anytext2 <- grepl("[[:alpha:]]", d$KTV_BEM)

# TRUE where the assigned canton code equals the wish-canton code
d$variable_canton_match <- with(d, KTV_KT_KZ == KTV_PZ_WUNSCH_KT_KZ)

# TRUE where the wish comment mentions Dublin (listed spellings only,
# case-sensitive)
d$dublin <- grepl("Dublin|DUBLIN|Dubllin|Dubin|dublin", d$comment)

# TRUE where the wish comment mentions "Zentrale" or one of the listed variants
d$zentrale <- grepl(
  "Zentrale|zentrale|ZENTRALE|Zentral|zentral|ZENTRAL|central|CENTRALE|centrale",
  d$comment
)

# Replace spelled-out canton names in the wish comment with two-letter codes.
#
# Each entry maps a canton code (the name) to a regular expression for the
# spellings to replace. Replacements run in the order listed.
#
# Names containing an accented letter (Zürich, Bâle, Extérieures, Graubünden,
# Neuchâtel, Genève) use "[^[:space:]]{0,2}" in place of that letter. This
# keeps the patterns pure ASCII and matches the letter as one character, as
# two bytes, as a plain ASCII substitute ("Zurich", "Zuerich") or when it is
# missing. The previous patterns spelled these letters as "??", which the
# regex engine reads as a lazy optional quantifier: "Z??rich" matched a bare
# "rich" (e.g. inside "Autriche") and "B??le-Ville" matched "le-Ville".
canton_name_patterns <- c(
  ZH = "Z[^[:space:]]{0,2}rich",
  BE = "Bern|Berne",
  LU = "Luzern|Lucerne",
  UR = "Uri",
  SZ = "Schwyz|Schwytz",
  OW = "Obwalden|Obwald",
  NW = "Nidwalden|Nidwald",
  GL = "Glarus|Glaris",
  ZG = "Zug|Zoug",
  FR = "Freiburg|Fribourg",
  SO = "Solothurn|Soleure",
  # "ll?" also accepts the doubled-l spelling the earlier pattern listed
  BS = "Basel Stadt|Basel-Stadt|B[^[:space:]]{0,2}ll?e-Ville",
  BL = "Basel Land|Basel-Land|Basel-Landschaft|Bale Campagne|Bale-Campagne",
  SH = "Schaffhausen|Schaffhouse",
  AR = "Appenzell Ausserrhoden|Appenzell Rhodes-Ext[^[:space:]]{0,2}rieures",
  AI = "Appenzell Innerrhoden|Appenzell Rhodes-Int[^[:space:]]{0,2}rieures",
  SG = "Sankt Gallen|St. Gallen|Saint Gall",
  GR = "Graub[^[:space:]]{0,2}nden|Grison",
  AG = "Aargau|Argovie",
  TG = "Thurgau|Thurgovie",
  TI = "Tessin|Ticino",
  VD = "Vaud|Waadt",
  VS = "Valais|Wallis",
  NE = "Neuch[^[:space:]]{0,2}tel|Neuenburg",
  GE = "Gen[^[:space:]]{0,2}ve|Genf",
  JU = "Jura"
)

# Start from the raw wish comment and substitute one canton at a time
d$comment_canton_abbrivated <- d$KTV_PZ_WUNSCH_BEM
for (canton_code in names(canton_name_patterns)) {
  d$comment_canton_abbrivated <- gsub(
    canton_name_patterns[[canton_code]],
    canton_code,
    d$comment_canton_abbrivated
  )
}

# TRUE where the abbreviated wish comment contains the assigned canton code.
# The code differs per row, so each row is tested with its own pattern (used
# as a regular expression). seq_len() yields no iterations for an empty frame,
# where 1:nrow(d) would have produced the indices 1 and 0.
d$comment_canton_match <- vapply(
  seq_len(nrow(d)),
  function(i) grepl(d$KTV_KT_KZ[i], d$comment_canton_abbrivated[i]),
  logical(1)
)

# Distinct assigned-canton codes occurring in the data
canton_abb <- unique(d[["KTV_KT_KZ"]])

# Does the abbreviated wish comment contain any canton code, a code from the
# German-speaking list, or a code from the French-speaking list? (FR and VS
# appear in both lists.) The search is case-sensitive and not anchored to
# word boundaries.
d[["comment_any_canton"]] <- grepl(
  "VD|AG|FR|TG|ZH|GR|BL|BE|BS|JU|SG|LU|TI|SO|VS|NW|SZ|GE|OW|NE|SH|ZG|GL|AR|UR|AI",
  d[["comment_canton_abbrivated"]]
)
d[["comment_any_canton_deutschspr"]] <- grepl(
  "ZH|BE|LU|UR|SZ|OW|NW|GL|ZG|FR|SO|BS|BL|SH|AR|AI|SG|GR|AG|TG|VS",
  d[["comment_canton_abbrivated"]]
)
d[["comment_any_canton_franzspr"]] <- grepl(
  "FR|TI|VD|VS|NE|GE|JU",
  d[["comment_canton_abbrivated"]]
)

# Share of rows whose comment names the assigned canton: overall, among rows
# with any text, and among those split by whether a wish canton was recorded
with(d, mean(comment_canton_match))
with(d, mean(comment_canton_match[anytext == 1]))
with(d, mean(comment_canton_match[anytext == 1 & nowunsch == 1]))
with(d, mean(comment_canton_match[anytext == 1 & nowunsch == 0]))

library(stringi)

# Does the wish comment designate a language region?
# These two flags must exist before the *_match columns below are derived
# from them. When they were created afterwards, `d$comment_deutschspr`
# partial-matched the freshly zero-filled `comment_deutschspr_match` column
# (likewise for franzspr), so no row was ever marked as a match.
d$comlower <- stri_trans_tolower(d$KTV_PZ_WUNSCH_BEM)
# "al[^[:space:]]{0,2}manique" accepts the accented letter in any encoding;
# the former "al??manique" was read as an optional "l" followed by "manique".
d$comment_deutschspr <- grepl("canton al[^[:space:]]{0,2}manique|allemand|deutschschweiz|deutschsprach|sprach| d | d\\.$| d\\,| d$|de$| de\\,\\.|deutsch| d\\.\\,|^d\\,|^d\\.",d$comlower)
d$comment_franzspr <- grepl("canton francophone|^westschweiz|canton romand|ct francophone| f | f\\.$| f\\,| f$| f\\.\\,|^f\\,|^f\\.",d$comlower)

# Assigned-canton codes counted as a match for each language designation.
# NOTE(review): BE and FR are in both lists and VS is in neither - confirm
# this is intended.
deutschspr_cantons <- c(
  "ZH", "BE", "LU", "UR", "SZ", "OW", "NW", "GL", "ZG", "FR",
  "SO", "BS", "BL", "SH", "AR", "AI", "SG", "GR", "AG", "TG"
)
franzspr_cantons <- c("BE", "FR", "TI", "VD", "NE", "GE", "JU")

# 1 if the comment asked for the German-speaking region and the assigned
# canton is on the German list, else 0 (%in% treats a missing code as no match)
d$comment_deutschspr_match <- as.numeric(
  d$comment_deutschspr & d$KTV_KT_KZ %in% deutschspr_cantons
)
mean(d$comment_deutschspr_match[d$comment_deutschspr == 1])

# 1 if the comment asked for the French-speaking region and the assigned
# canton is on the French list, else 0
d$comment_franzspr_match <- as.numeric(
  d$comment_franzspr & d$KTV_KT_KZ %in% franzspr_cantons
)
mean(d$comment_franzspr_match[d$comment_franzspr == 1])

# Mark every case as free (1), then recode to 0 where the assignment matches
# the wish-canton code, a canton named in the comment, or the language region
# named in the comment. Rows where the combined condition is NA stay at 1.
d$freecase <- 1
d$freecase[
  d$variable_canton_match == 1 |
    d$comment_canton_match == 1 |
    d$comment_deutschspr_match == 1 |
    d$comment_franzspr_match == 1
] <- 0

# Last four characters of the KTV_D value, stored as the year
d$KTV_year <- stri_sub(as.character(d$KTV_D), from = -4, to = -1)

# Share of free cases overall and for selected years
with(d, mean(freecase))
with(d, mean(freecase[KTV_year == 2007]))
with(d, mean(freecase[KTV_year == 2008]))
with(d, mean(freecase[KTV_year == 2009]))
with(d, mean(freecase[KTV_year == 2017]))
with(d, mean(freecase[KTV_year == 2018]))


# Save data ---------------------------------------------------------------
# Keep the identifier, assignment fields and derived flags, then write them
# as a text table without row names into the ZEMIS data folder.
dat.comments <- d[c(
  "PERS_ID", "KTV_KT_KZ", "KTV_D", "KTV_year", "KTV_PZ_WUNSCH_KT_KZ",
  "nowunsch", "anytext", "variable_canton_match", "dublin",
  "comment_canton_match", "comment_deutschspr", "comment_franzspr",
  "zentrale", "comment_any_canton", "freecase"
)]
setwd("/Users/hadomini/Dropbox (PP)/ZEMIS/Data/")
write.table(
  dat.comments,
  file = "comments.txt",
  fileEncoding = "UTF8",
  row.names = FALSE
)



